package com.esri.json.hadoop;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;
import org.codehaus.jackson.map.ObjectMapper;
/**
 * Enumerates records from an Enclosed JSON file - use either Esri JSON or GeoJSON subclass.
 *
 * <p>The reader scans the document for an array named {@code "features"} and then emits each
 * element object of that array as one record. The key is the parser offset taken just before
 * the feature is read; the value is the feature re-serialized as JSON text.
 *
 * <p>The class implements both the MRv2 ({@code org.apache.hadoop.mapreduce.RecordReader}) and
 * the MRv1 ({@code org.apache.hadoop.mapred.RecordReader}) contracts; {@link #nextKeyValue()}
 * simply delegates to {@link #next(LongWritable, Text)}.
 */
public abstract class EnclosedBaseJsonRecordReader extends RecordReader<LongWritable, Text> implements
        org.apache.hadoop.mapred.RecordReader<LongWritable, Text> {

    static final Log LOG = LogFactory.getLog(EnclosedBaseJsonRecordReader.class.getName());

    /** Key instance reused by the MRv2 API ({@link #nextKeyValue()} / {@link #getCurrentKey()}). */
    protected LongWritable mkey = null;

    /** Value instance reused by the MRv2 API ({@link #nextKeyValue()} / {@link #getCurrentValue()}). */
    protected Text mval = null;

    /** Stream over the file being read; opened by the MRv1 constructor or by {@code initialize}. */
    protected InputStream inputStream;

    /** Length of the split, used as the denominator in {@link #getProgress()}. */
    protected long splitLen = 0; // for getProgress

    /** Created lazily on the first call to {@link #next(LongWritable, Text)}. */
    protected JsonParser parser;

    /**
     * Creates an uninitialized reader (MRv2 usage); the input is opened later by
     * {@link #initialize(InputSplit, TaskAttemptContext)}.
     *
     * @throws IOException declared for subclasses and constructor chaining
     */
    protected EnclosedBaseJsonRecordReader() throws IOException {
        mkey = createKey();
        mval = createValue();
    }

    /**
     * Creates a reader and immediately opens the file of the given split (MRv1 usage).
     *
     * @param split the split to read; must be an {@code org.apache.hadoop.mapred.FileSplit}
     * @param conf  configuration used to resolve the file system of the split's path
     * @throws IOException if the file cannot be opened
     */
    protected EnclosedBaseJsonRecordReader(org.apache.hadoop.mapred.InputSplit split,
            Configuration conf) throws IOException {
        // Chain to the no-arg constructor so mkey/mval are never null, whichever API is used.
        this();
        org.apache.hadoop.mapred.FileSplit fileSplit = (org.apache.hadoop.mapred.FileSplit) split;
        splitLen = fileSplit.getLength(); // using MRv1
        commonInit(fileSplit.getPath(), conf);
    }

    /**
     * Closes the JSON parser (if one was created) and the underlying input stream.
     *
     * @throws IOException if closing the parser or the stream fails
     */
    @Override
    public void close() throws IOException {
        try {
            if (parser != null) {
                parser.close();
            }
        } finally {
            // Close the stream explicitly as well: it must be released even when no parser
            // was ever created or when closing the parser failed.
            if (inputStream != null) {
                inputStream.close();
            }
        }
    }

    /** @return a new, empty key object */
    @Override
    public LongWritable createKey() {
        return new LongWritable();
    }

    /** @return a new, empty value object */
    @Override
    public Text createValue() {
        return new Text();
    }

    /** @return the key object filled by the most recent {@link #nextKeyValue()} call */
    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return mkey;
    }

    /** @return the value object filled by the most recent {@link #nextKeyValue()} call */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return mval;
    }

    /**
     * Returns the current parser offset in the input.
     *
     * @return 0 if parsing has not started yet, otherwise the parser's current offset
     * @throws IOException declared by the MRv1 interface
     */
    @Override
    public long getPos() throws IOException {
        if (parser == null) {
            return 0;
        }
        return currentPosition();
    }

    /**
     * Returns the fraction of the split consumed so far.
     *
     * @return a value between 0 and 1; 0 when the split length is unknown, parsing has not
     *         started, or the parser cannot report an offset
     * @throws IOException declared by the record-reader interfaces
     */
    @Override
    public float getProgress() throws IOException {
        if (splitLen == 0 || parser == null) {
            return 0;
        }
        // Progress is measured against the split length, so the byte offset is preferred;
        // fall back to the character offset when the parser does not report bytes.
        long offset = firstKnown(parser.getCurrentLocation().getByteOffset(),
                parser.getCurrentLocation().getCharOffset());
        if (offset < 0) {
            return 0;
        }
        return Math.min(1.0f, (float) offset / splitLen);
    }

    /**
     * Opens the file of the given split (MRv2 usage).
     *
     * @param split       the split to read; must be a
     *                    {@code org.apache.hadoop.mapreduce.lib.input.FileSplit}
     * @param taskContext supplies the configuration used to resolve the file system
     * @throws IOException if the file cannot be opened
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext taskContext)
            throws IOException, InterruptedException {
        FileSplit fileSplit = (FileSplit) split;
        splitLen = fileSplit.getLength(); // using MRv2
        commonInit(fileSplit.getPath(), taskContext.getConfiguration());
    }

    /**
     * Reads the next feature object into {@code value} and records the parser offset in
     * {@code key}.
     *
     * <p>Both Esri JSON and GeoJSON conveniently keep their records in a "features" array,
     * so the same scan works for either format.
     *
     * @param key   receives the parser offset taken just before the feature is read
     * @param value receives the JSON text of the feature
     * @return {@code true} if a feature was read; {@code false} if no "features" array was
     *         found or the next token is not an unnamed object (e.g. end of the array)
     * @throws IOException if the input cannot be read or parsed
     */
    @Override
    public boolean next(LongWritable key, Text value) throws IOException {
        JsonToken token;

        // First call to next(): create the parser and advance to the start of the
        // "features" array.
        if (parser == null) {
            parser = new JsonFactory().createJsonParser(inputStream);
            parser.setCodec(new ObjectMapper());

            token = parser.nextToken();
            while (token != null && !(token == JsonToken.START_ARRAY
                    && "features".equals(parser.getCurrentName()))) {
                token = parser.nextToken();
            }

            if (token == null) {
                return false; // never found the features array
            }
        }

        key.set(currentPosition());

        token = parser.nextToken();

        // A record must be a start-object token with no field name, i.e. an array element.
        if (token != JsonToken.START_OBJECT || parser.getCurrentName() != null) {
            return false;
        }

        JsonNode node = parser.readValueAsTree();
        value.set(node.toString());
        return true;
    }

    /**
     * Advances to the next feature, storing it in the reader's own key/value objects.
     *
     * @return {@code true} if a feature was read, {@code false} otherwise
     * @throws IOException if the input cannot be read or parsed
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        return next(mkey, mval);
    }

    /** Opens {@code filePath} on the file system resolved from {@code conf}. */
    private void commonInit(Path filePath, Configuration conf) throws IOException {
        FileSystem fs = filePath.getFileSystem(conf);
        inputStream = fs.open(filePath);
    }

    /**
     * Returns the parser's current offset, preferring the character offset (the value this
     * reader has always used for positions and keys) and falling back to the byte offset
     * when the parser does not report characters. Requires {@code parser} to be non-null.
     */
    private long currentPosition() {
        return firstKnown(parser.getCurrentLocation().getCharOffset(),
                parser.getCurrentLocation().getByteOffset());
    }

    /** Returns {@code preferred} unless it is negative (offset not available), else {@code fallback}. */
    private static long firstKnown(long preferred, long fallback) {
        return preferred >= 0 ? preferred : fallback;
    }
}